#library(plyr)
#require(dplyr)
#require(tidyr)
#require(stringr)
#require(haven)
#require(survey)
#require(xlsx)
#require(ggplot2)
#options(OutDec= ".")
#options(stringsAsFactors = FALSE)
######################################
##### commands to remember
#macro like functions
#result <- eval(parse(text = paste("data",1)))
#exists('var')
#get('var')
#assign('var',4)
#paste
#dput(names(xtabs(~df$var)))
#clipboard
#read.table("clipboard", sep="\t",header = TRUE)
#write.table(data.frame, "clipboard", sep="\t", row.names=FALSE)
#readClipboard()
#writeClipboard(array)
#xlsx
# wb <- createWorkbook()
# wb <- loadWorkbook(file.xlsx)
# sheet <- createSheet(wb, sheetName='Resumo - Seg1')
# addDataFrame(df.seg, sheet,row.names=FALSE)
# saveWorkbook(wb, file=file.xlsx)
######################################
##### read spss data and labels
get_spss <- function(file){
#' Load spss data with labels converted to factors
#'
#' This function loads an spss .sav file, and creates a dataframe with the description,
#' type and labels of all variables in the data file. This function automatically applies
#' as_factor to the data, converting all variables with labels in spss to factors in R.
#'
#' To convert labels to factors function \code{haven::as_factor(.,only_labelled = TRUE,ordered=TRUE)} is used.
#' For details, see \code{?as_factor}.
#'
#' The dataframe with the description of all variables contains the following information:
#' \itemize{
#' \item \strong{var}: Variable name in SPSS and R
#' \item \strong{name}: Variable label in SPSS
#' \item \strong{spss.format}: Variable format in SPSS
#' \item \strong{class}: Variable class in R
#' \item \strong{n.na}: Number of \code{NA}'s
#' \item \strong{n.nan}: Number of \code{NaN}'s
#' \item \strong{n.distinct}: Number of distinct values
#' \item \strong{n.labels}: Number of labels
#' \item \strong{labels}: All labels concatenated and separated by ';'
#' \item \strong{labels.1 thru labels.n}: each label is displayed separatly in a variable
#' }
#'
#' @param file A characater string with the location of the spss file to be loaded.
#' @return A list with two components:
#' \itemize{
#' \item \strong{sav}\emph{(dataframe)}: with the actual data
#' \item \strong{vars}\emph{(dataframe)}: with the description of all variables
#' }
#'
#' @examples
#' spss.data <- get_spss(file)$sav
#'
#' spss <- get_spss(file)
#' spss.data <- spss$sav
#' spss.vars <- spss$vars
df.orig <- haven::read_sav(file)
########################################
#fix for NaN instead of NA problem
# num.vars <- unlist(lapply(df.orig,function(x){is.numeric(x)}))
# df.orig[,num.vars] <- lapply(df.orig[,num.vars],function(x){ x[is.nan(x)]<-NA;return(x)})
# df.orig <- as.data.frame(df.orig)
# char.vars <- unlist(lapply(df.orig,function(x){is.character(x) | is.factor(x)}))
# df.orig[,char.vars] <- lapply(df.orig[,char.vars],function(x){ x[x == 'NaN']<-NA;return(x)})
# df.orig <- as.data.frame(df.orig)
df.orig <- as.data.frame(df.orig %>% purrr::map(function(x){y <- ifelse(is.nan(x)==TRUE,NA,x);attributes(y) <- attributes(x);return(y)}))
########################################
########################################
vars <- data.frame(.id=names(df.orig))
#vars.labels <- plyr::ldply(df.orig,'attr',which="label")
vars.labels <- as.data.frame(purrr::map_chr(df.orig,~ifelse(length(attr(.,which="label")) == 0,"",attr(.,which="label"))))
formato <- plyr::ldply(df.orig,'attr',which="format.spss")
na <- plyr::ldply(df.orig,function(x){sum(is.na(x))})
nan <- plyr::ldply(df.orig,function(x){sum(is.nan(x))})
df <- df.orig
df <- haven::as_factor(df,only_labelled = TRUE)
n.labels <- plyr::ldply(df,function(x){length(levels(x))})
labels <- plyr::ldply(df,function(x){paste(levels(x),collapse="; ")})
distinct <- plyr::ldply(df,function(x){length(xtabs(~x,na.action = na.pass,drop.unused.levels = TRUE))})
n <- max(as.numeric(n.labels$V1))
# #convertendo de volta para string
# if (getOption("stringsAsFactors") == FALSE){
# char.vars <- str_detect(formato$attr,"^A")
# df[,char.vars] <- sapply(df[,char.vars],as.character)
# }
classe <- plyr::ldply(df,function(x){paste(class(x),collapse="-")})
df.vars <- cbind(vars,vars.labels)
df.vars <- dplyr::left_join(df.vars,formato,by=".id")
df.vars <- dplyr::left_join(df.vars,classe,by=".id")
df.vars <- dplyr::left_join(df.vars,na,by=".id")
df.vars <- dplyr::left_join(df.vars,nan,by=".id")
df.vars <- dplyr::left_join(df.vars,distinct,by=".id")
df.vars <- dplyr::left_join(df.vars,n.labels,by=".id")
df.vars <- dplyr::left_join(df.vars,labels,by=".id")
names(df.vars) <- c('var','name','spss.format','class','n.na','n.nan','n.distinct','n.labels','labels')
df.vars[,paste0('label.',1:n)] <- stringr::str_split_fixed(df.vars$labels,'; ',n)
return(list(sav=df,vars=df.vars))
}
write_spss <- function(df,file,max=255){
#' Save spss data
#'
#' This function saves an spss '.sav' file removing known erros. String with length 0 are tranformed to " ". Also,
#' all character string are truncated to max length of paramenter 'max' and [[:cntrl:]] string are removed.
#'
#' @param df The dataframe to be exported to sav format.
#' @param file A characater string with the location of the spss file to be saved.
#' @param max Maximum size of character strings.
#' @return NULL
#'
#' @examples
#' spss.data <- write_sav(df,file)
df <- as.data.frame(purrr::map(df,function(x){
z <- x
if(is.character(x) == TRUE){
z <- stringr::str_replace_all(stringr::str_trunc(z,max),'[[:cntrl:]]','')
if (max(nchar(as.character(x)))==0){
z <- rep(" ",length(x))
}
}
attributes(z) <- attributes(x)
return(z)
}))
ret <- haven::write_sav(df,file)
return(ret)
}
######################################
##### dplyr::summarise dataframe
df_summary <- function(df,drop=FALSE){
#' Summarises all variables in a dataframe
#'
#' This function calculates the following statistics for all variables in the dataframe:
#' \itemize{
#' \item \strong{var}: Variable name
#' \item \strong{name}: Variable label
#' \item \strong{class}: Variable class
#' \item \strong{n.na}: Number of \code{NA}'s
#' \item \strong{n.nan}: Number of \code{NaN}'s
#' \item \strong{n.distinct}: Number of distinct values
#' \item \strong{n.labels}: Number of labels
#' \item \strong{labels}: All labels concatenated and separated by ';'
#' \item \strong{labels.1 thru labels.n}: each label is displayed separatly in a variable
#' }
#'
#' @param df The dataframe to be summarised.
#' @param drop If \code{droplevels(df)} should be run before summarizing the dataframe.
#' This can make the orginal dataframe and the one analyzied differ. We
#' recomend you run \code{droplevels(df)} before calling this function if needed.
#' @return A \emph{(dataframe)} with the description of all variables.
#'
#' @examples
#' summary <- df_summary(df)
#'
if (drop == TRUE){
df <- droplevels(df)
warning('Function droplevels was used. Analised DF may be different than you expect!')
}
vars <- plyr::ldply(df,'attr',which="label")
classe <- plyr::ldply(df,function(x){paste(class(x),collapse="-")})
na <- plyr::ldply(df,function(x){sum(is.na(x))})
nan <- plyr::ldply(df,function(x){sum(is.nan(x))})
n.labels <- plyr::ldply(df,function(x){length(levels(x))})
labels <- plyr::ldply(df,function(x){paste(levels(x),collapse=";")})
distinct <- plyr::ldply(df,function(x){length(xtabs(~x,na.action = na.pass))})
n <- max(as.numeric(n.labels$V1))
df.vars <- as.data.frame(classe)
if (dim(vars)[1] > 0){df.vars <- dplyr::left_join(df.vars,vars,by=".id")}
df.vars <- dplyr::left_join(df.vars,na,by=".id")
df.vars <- dplyr::left_join(df.vars,nan,by=".id")
df.vars <- dplyr::left_join(df.vars,distinct,by=".id")
df.vars <- dplyr::left_join(df.vars,n.labels,by=".id")
df.vars <- dplyr::left_join(df.vars,labels,by=".id")
if (dim(vars)[1] > 0){
names(df.vars) <- c('name','class','var','n.na','n.nan','n.distinct','n.labels','labels')
} else {
names(df.vars) <- c('name','class','n.na','n.nan','n.distinct','n.labels','labels')
}
if (n > 0){
df.vars[,paste0('label.',1:n)] <- stringr::str_split_fixed(df.vars$labels,';',n)
}
return(NULL)
}
######################################
##### recode functions
dummy_all <- function(df=NULL,reg.exp=NULL,keep_all=TRUE){
#' Creates an indicador variable for each category of the selected variables
#'
#' This function creates an indicador variable for each category of the selected variables
#' that belong to classes \emph{factor} or \emph{character}. Variables are selected using a
#' regular expression. The user can choose to keep all categories or to drop the reference
#' category. The new indicathor variables are named with the category label/value - only
#' alpha-numerical characters are kept, and all accents are removed. Empty categories (i.e.
#' factor levels that are defined but not used) are dropped.
#'
#' @param df The dataframe containing the variables to be recoded.
#' @param reg.exp A \emph{regular expression} identifying the variables that should be analysed.
#' @param keep_all If \code{TRUE}, then for every category a dummy variable will be created. If
#' \code{FALSE} then the reference category will be dropped.
#' @return A \emph{(dataframe)} contaning only the dummy variables that were created.
#'
#' @examples
#' df.dummies <- dummy_all(df,reg.exp='^P[0-9]+$',keep_all=TRUE)
#'
#dplyr::selecting variables
df <- df %>% dplyr::select(matches(reg.exp))
inds <- sapply(df,class) %in% c("factor","character")
df <- df[,inds]
if (sum(inds) > 0) warning("Only Factor and Character variables are kept!!!")
#creating names
df.names <- df %>% tidyr::gather(var,categ)
df.names <- df.names %>% dplyr::group_by(var,categ) %>% dplyr::summarise(freq=n())
df.names <- df.names %>% dplyr::group_by(var) %>% dplyr::mutate(n=row_number())
df.names$orig <- paste0(df.names$var,df.names$categ)
df.names$orig <- iconv(df.names$orig,from='UTF-8', to='ASCII//TRANSLIT')
df.names$name <- str_replace_all(df.names$categ,'[^[:alnum:]]','.')
df.names$name <- str_replace_all(df.names$name,'\\.+','.')
df.names$name <- str_replace_all(df.names$name,'\\.$','')
df.names$name <- iconv(df.names$name,from='UTF-8', to='ASCII//TRANSLIT')
df.names$new <- paste0(df.names$var,".",df.names$n,"_",df.names$name)
df.names <- df.names %>% ungroup() %>% dplyr::select(orig,new)
#creating dataframe of dummies
if (keep_all==TRUE){
form <- paste0("~ -1 + ",paste(names(df),collapse = " + "))
form <- as.formula(form)
df.dummy <- as.data.frame(model.matrix(form,df,contrasts.arg = lapply(df, contrasts, contrasts=FALSE)))
} else {
form <- paste0("~ ",paste(names(df),collapse = " + "))
form <- as.formula(form)
df.dummy <- as.data.frame(model.matrix(form,df))
df.dummy <- df.dummy[,-1]
}
#dropping empty variables
inds <- unname(colSums(df.dummy,na.rm = TRUE) > 0)
df.dummy <- df.dummy[,inds]
names.orig <- iconv(names(df.dummy),to='ASCII//TRANSLIT')
df.merge <- data.frame(orig=names.orig,order=1:length(names.orig))
df.merge <- dplyr::left_join(df.merge,df.names,by="orig")
df.merge <- df.merge %>% dplyr::arrange(order)
if(sum(is.na(df.merge$new)) > 0) stop("Something went wrong with the df labels!")
names(df.dummy) <- df.merge$new
return(df.dummy)
}
recode_all <- function(df=NULL){
#' space holder
}
######################################
##### segmentation - hierarquical + kmeans
#atualizar com o purrr e o broom
#https://cran.r-project.org/web/packages/broom/vignettes/kmeans.html
auto_cluster <- function(df=NULL,grps=3:6,name='kmeans',iter.max=100){
#' Cluster Analysis combining Hierarquical and Kmeans cluster
#'
#' This function runs as Cluster Analysis. The first step is to run a Hierarquical Cluster,
#' and then use centroids as starting point for the Kmeans Cluster. This function outputs the
#' cluster ids for each line in the dataframe and a summary of the group sizes. Details of the
#' analysis for each step are:
#' \itemize{
#' \item \strong{Hierarquical}: Uses Euclidean distance and Ward's method. See more details
#' in \code{\link[stats]{hclust}}.
#' \item \strong{Kmeans}: Uses default options. See more details in \code{\link[stats]{kmeans}}.
#' }
#'
#' @param df A \emph{dataframe} containing the variables to be used in the analysis.
#' @param grps An \emph{array} or \emph{number} with the number of groups that should be created.
#' @param name A \emph{string} with the name of the variables that will be created.
#' @param iter.max A \emph{number} indicating the maximum number of iterations for the Kmeans cluster.
#' @return A list with two components:
#' \itemize{
#' \item \strong{grps}\emph{(dataframe)}: with the variables identifying the cluster each observation
#' belongs too.
#' \item \strong{vars}\emph{(dataframe)}: summary of the number of observations per cluster.
#' }
#' @examples
#' df.cluster <- auto_cluster(df=df,grps=3:6,name='kmeans')
#'
df_ <- df
d <- dist(df, method = "euclidean")
hrq <- hclust(d, method="ward")
for (i in grps){
grp <- cutree(hrq, k=i)
aux.df <- cbind(grp,df)
aux.df <- aux.df %>% dplyr::group_by(grp) %>% dplyr::summarise_each(funs(mean))
aux <- as.matrix(aux.df[,-1])
#Cluster kmeans
df_[,paste0(name,i)] <- kmeans(df, aux, iter.max = iter.max)$cluster
}
df_ <- df_ %>% dplyr::select(starts_with(name))
df.seg <- df_ %>% tidyr::gather(var,seg) %>% dplyr::group_by(var,seg) %>% dplyr::summarise(n=n())
df.seg <- as.data.frame(df.seg %>% tidyr::spread(var,n))
return(list(grps=df_,summary=df.seg))
}
######################################
##### Factor analysis
factor_analysis <- function(df=NULL,n.fat=NULL,name="fator",sep=".",rotation="varimax",scores="regression",cut=0.2){
#' Factor Analysis choosing number of factors automatically
#'
#' This function runs as Factor Analysis. The first step is to calculate the number of factors
#' automatically. Once the number of factor is choosen, the Factor Analysis is run, and both
#' the estimated factor variables and the loadings are calculated. Missing values aren't aloud. Details
#' of the analysis for each step are:
#' \itemize{
#' \item \strong{# of Factors}: Calculates the number of eigenvalues in the correlation matrix that
#' are larger then 1. The idea is that if the variance of a standardized variable is 1, then a factor
#' should only be maintained if it's variance is at least one. See more details in \code{\link[base]{eigen}}.
#' \item \strong{Factor Analysis}: The default options are \code{rotation='varimax'} and
#' \code{scores='regression'}. See more details in \code{\link[stats]{factanal}}.
#' }
#'
#' @param df A \emph{dataframe} containing the variables to be used in the analysis.
#' @param n.fat A \emph{number} with the number of factors to use. if \emph{NULL} then the
#' number of factors is calculated automatically.
#' @param name A \emph{string} with the name of the factor variables that will be created.
#' @param sep A \emph{string} with the character to be used separating name of the factor
#' from the number of the factor
#' @param rotation A \emph{string} identifying the type of rotation to be performed.
#' @param scores A \emph{string} identifying the type of scores to be extracted.
#' @param cut A \emph{number} indicating the minimum size of factor loadings that should be kept.
#' Loadings smaller then \emph{cut} will be dropped.
#' @return A list with two components:
#' \itemize{
#' \item \strong{factors}\emph{(dataframe)}: with the variables with the scores extrated from
#' the factor analysis.
#' \item \strong{loadings}\emph{(dataframe)}: with the extracted factor loadings.
#' \item \strong{details}\emph{(list)}: with all of the information returned from factanal.
#' }
#' @examples
#' df_factan <- factor_analysis(df=df,cut=0.2)
#'
#number of factors
if (is.null(n.fat)){
n.fat <- eigen(cor(df,use = "pairwise.complete.obs"))
n.fat <- sum(n.fat$values >= 1)
}
#if there are missings
#cov.mat <- cov(df[,-1],use = "pairwise.complete.obs")
#fit <- factanal(x=df[,-1], factors=n.fat, covmat=cov.mat, rotation="varimax",scores="regression")
fit <- factanal(x=df, factors=n.fat, rotation=rotation,scores=scores)
df.scores <- as.data.frame(fit$scores)
names(df.scores) <- paste0(name,sep,1:n.fat)
load <- as.data.frame(fit$loadings[])
names(load) <- paste0(name,sep,1:n.fat)
load$fator <- apply(abs(load),1,which.max)
load$max <- apply(abs(load[,-dim(load)[2]]),1,max)
load$var <- row.names(load)
load <- load %>% tidyr::gather(factor,loading,-var,-fator,-max)
load$loading <- ifelse(abs(load$loading) <= cut,NA,load$loading)
load <- load %>% tidyr::spread(factor,loading)
load <- load %>% dplyr::arrange(fator,-max)
load <- load[,c('var','fator',paste0(name,sep,1:n.fat))]
return(list(factors=df.scores,loadings=load,details=fit))
}
######################################
##### dplyr::summarise with totals / margin
tab_summary <- function(df=NULL,reg.exp_lin=NULL,reg.exp_col=NULL,wgt=NULL){
#' Summary table is created, automatically adding marginal
#'
#' This function is still incomplete, for now it only does counts and column total. NEED TO UPDATE!
#'
#' @param df A \emph{dataframe} containing the variables to be used in the analysis.
#' @param reg.exp_lin A \emph{regular expression} identifying the variables that should be used
#' to create the \emph{lines} of the table.
#' @param reg.exp_col A \emph{regular expression} identifying the variables that should be used
#' to create the \emph{columns} of the table.
#' @param wgt A \emph{string} identifying the weight variable. If \emph{NULL} weights are ignored.
#' @return A \emph{(dataframe)} with the summary table.
#' @examples
#' df.tab <- tab_summary(df=df,reg.exp_lin='^P1[0-4]$',reg.exp_col='^P2[6-9]$',wgt=NULL)
#'
if (is.null(wgt)){
df$peso <- 1
} else {
df$peso <- df[,wgt]
}
df.tab <- df %>% dplyr::select(peso,matches(reg.exp_lin),matches(reg.exp_col)) %>% tidyr::gather(var,categ,matches(reg.exp_lin)) %>% tidyr::gather(seg,grp,matches(reg.exp_col))
df.tab <- df.tab %>% dplyr::group_by(seg,grp,var,categ) %>% dplyr::summarise(freq=sum(peso,na.rm = TRUE))
df.tab <- df.tab %>% dplyr::group_by(seg,grp,var) %>% dplyr::mutate(freq=round(100*freq/sum(freq),1))
df.tab <- df.tab %>% tidyr::unite(grp,seg,grp)
df.tab <- df.tab %>% tidyr::spread(grp,freq)
df.tot.lin <- df %>% dplyr::select(peso,matches(reg.exp_lin)) %>% tidyr::gather(var,categ,matches(reg.exp_lin))
df.tot.lin <- df.tot.lin %>% dplyr::group_by(var,categ) %>% dplyr::summarise(total=sum(peso,na.rm = TRUE))
df.tot.lin <- df.tot.lin %>% dplyr::group_by(var) %>% dplyr::mutate(total=round(100*total/sum(total),1))
df.tab <- dplyr::left_join(df.tot.lin,df.tab,by=c("var","categ"))
return(df.tab)
}
######################################
##### Raking fix
######################################
##### Soft merge - usando agrep e adist
######################################
##### Funções pra gerar apresentações
##### outro arquivo
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.